
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css">
    <title>Awesome Diffusion Transformers</title>
    <link rel="icon" type="image/png" href="assets/smiley.png">
    <link href="./assets/style.css" rel="stylesheet">
    <style>
        .title-with-icon-container {
            display: flex;
            justify-content: center; /* Center the container's content */
            align-items: center;
            position: relative; /* Needed for absolute positioning of the icon */
        }
        .title {
            display: inline; /* Display inline for tighter control over positioning */
        }
        .github-link {
            margin-left: 10px; /* Small gap between the title and the icon, adjust as needed */
            font-size: 24px; /* Adjust size as needed */
            color: black; /* Icon color */
            vertical-align: middle; /* Align with the middle of the text if needed */
        }
        table {
            border-collapse: collapse;
            width: 100%;
        }
        th, td {
            text-align: center;
            padding: 8px;
        }
        td:first-child, th:first-child {
            text-align: left;
        }
        th#dateHeader {
            min-width: 110px; /* Adjust as needed */
            cursor: pointer; /* Change cursor on hover */
            position: relative; /* Allows absolute positioning of sort icons */
        }
        /*
         * Sort arrows for the Date header, drawn as pseudo-elements so no extra
         * markup is needed. Both arrows start grey and are vertically centred
         * inside the header cell (which is position: relative).
         */
        th#dateHeader::after, th#dateHeader::before {
            font-size: 0.8em;
            color: grey; /* Inactive colour; overridden below for the active arrow */
            position: absolute;
            top: 50%;
            transform: translateY(-50%); /* Pull back up by half the arrow's own height */
        }
        /* Up arrow: the right-most of the two. */
        th#dateHeader::after {
            content: "▲";
            right: 5px; /* Distance from the cell's right edge */
        }
        /* Down arrow: sits to the left of the up arrow. */
        th#dateHeader::before {
            content: "▼";
            right: 20px; /* Larger offset so it does not overlap the up arrow */
        }
        /*
         * Highlighted arrow: "active asc" darkens the up arrow (::after),
         * "active desc" darkens the down arrow (::before). These classes are
         * set on the header by updateSortIndicator().
         */
        th#dateHeader.active.asc::after, th#dateHeader.active.desc::before {
            color: black; /* Darker colour for the highlighted arrow */
        }
    </style>
    <script>
        // Direction ('asc' or 'desc') that the next call to sortTableByDate() will sort in.
        // sortTableByDate() flips it after every sort; updateSortIndicator() reads it.
        let sortDirection = 'asc';

        /**
         * Sorts the data rows of #myTable by the date in their second cell.
         *
         * The direction is taken from the global `sortDirection`; after sorting,
         * the header indicator is updated to show the direction that was just
         * applied and `sortDirection` is flipped so the next click reverses it.
         *
         * The first row of the table is treated as the header and never moves.
         * Rows whose date cannot be parsed are placed after all dated rows.
         */
        function sortTableByDate() {
            const table = document.getElementById("myTable");
            const ascending = sortDirection === 'asc';

            // Parse every date exactly once instead of on each comparison.
            // textContent (not innerHTML) so markup inside the cell cannot leak into the parser.
            const entries = Array.from(table.rows).slice(1).map(function (row, index) {
                const dateCell = row.getElementsByTagName("TD")[1];
                const time = dateCell ? parseDate(dateCell.textContent.trim()).getTime() : NaN;
                return { row: row, time: time, index: index };
            });

            entries.sort(function (a, b) {
                const aInvalid = Number.isNaN(a.time);
                const bInvalid = Number.isNaN(b.time);
                if (aInvalid || bInvalid) {
                    // Undated rows sink to the bottom, keeping their original order.
                    return (aInvalid - bInvalid) || (a.index - b.index);
                }
                const byDate = ascending ? a.time - b.time : b.time - a.time;
                // Tie-break on original position so equal dates never reorder,
                // even on engines whose Array.prototype.sort is not stable.
                return byDate || (a.index - b.index);
            });

            // Re-appending in sorted order moves each row to the end of its
            // parent, which leaves the header row on top.
            entries.forEach(function (entry) {
                entry.row.parentNode.appendChild(entry.row);
            });

            // Show the direction that was just applied *before* flipping it,
            // so the highlighted arrow matches the order visible in the table.
            updateSortIndicator();
            sortDirection = ascending ? 'desc' : 'asc';
        }

        /**
         * Parses a date string such as "28 Dec 2022" into a Date (local midnight).
         *
         * The month is recognised by its first three letters, case-insensitively,
         * so "Dec", "dec", "Dec." and "December" are all accepted.
         *
         * @param {string} dateStr Text containing "<day> <month name> <4-digit year>".
         * @returns {Date} The parsed date, or an Invalid Date (getTime() is NaN)
         *                 when the text does not contain a recognisable date.
         */
        function parseDate(dateStr) {
            const MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                            'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];

            const parts = /(\d{1,2})\s+([A-Za-z]+)\.?\s+(\d{4})/.exec(String(dateStr));
            if (!parts) {
                // No match: report it as an Invalid Date rather than throwing on parts[3].
                return new Date(NaN);
            }

            // Look the month up by name; an unknown name must not silently
            // turn into some other month.
            const monthIndex = MONTHS.indexOf(parts[2].slice(0, 3).toLowerCase());
            if (monthIndex === -1) {
                return new Date(NaN);
            }

            return new Date(Number(parts[3]), monthIndex, Number(parts[1]));
        }

        /**
         * Syncs the Date header's CSS classes with the global `sortDirection`.
         *
         * Any previous "active" / "asc" / "desc" classes are cleared, then
         * "active" plus either "asc" (when sortDirection is 'asc') or "desc"
         * (any other value) is applied. The stylesheet uses these classes to
         * darken one of the two sort arrows.
         */
        function updateSortIndicator() {
            const dateHeader = document.querySelector("#dateHeader");
            const directionClass = sortDirection === 'asc' ? "asc" : "desc";

            // Start from a clean slate so stale direction classes never linger.
            dateHeader.classList.remove("active", "asc", "desc");
            dateHeader.classList.add("active", directionClass);
        }

        /**
         * Shows or hides the data rows of #myTable according to the value
         * selected in #taskFilter.
         *
         * A row's tasks are the alt texts of the <img> icons in its 4th cell,
         * compared case-insensitively:
         *   - "all"    : every data row is shown, including rows without icons.
         *   - "others" : rows with at least one icon that is not Image, Video,
         *                3D or Speech.
         *   - anything else : rows with an icon whose alt text equals the value.
         *
         * The first row (the header) is never touched.
         */
        function filterTableByTask() {
            const KNOWN_TASKS = ["IMAGE", "VIDEO", "3D", "SPEECH"];
            const filterValue = document.getElementById("taskFilter").value.toUpperCase();
            const rows = document.getElementById("myTable").getElementsByTagName("tr");

            for (let i = 1; i < rows.length; i++) {
                let visible;

                if (filterValue === "ALL") {
                    // Decided per row, not per icon, so rows with no task icon
                    // (or no 4th cell) are not left hidden.
                    visible = true;
                } else {
                    const taskCell = rows[i].getElementsByTagName("td")[3]; // Task is the 4th column
                    const tasks = taskCell
                        ? Array.from(taskCell.getElementsByTagName("img"), function (img) {
                              return img.alt.toUpperCase();
                          })
                        : [];

                    if (filterValue === "OTHERS") {
                        visible = tasks.some(function (task) {
                            return KNOWN_TASKS.indexOf(task) === -1;
                        });
                    } else {
                        visible = tasks.indexOf(filterValue) !== -1;
                    }
                }

                // Empty string restores the default display for the row.
                rows[i].style.display = visible ? "" : "none";
            }
        }
    </script>
</head>
<body>

<div class="title-with-icon-container">
    <h1 class="title">Awesome Diffusion Transformers</h1>
    <!-- Icon-only link: aria-label gives it an accessible name, and rel stops the
         new tab from reaching back into this page through window.opener. -->
    <a href="https://github.com/ShoufaChen/Awesome-Diffusion-Transformers" class="github-link" target="_blank" rel="noopener noreferrer" aria-label="Awesome Diffusion Transformers on GitHub (opens in a new tab)">
        <i class="fab fa-github" aria-hidden="true"></i>
    </a>
</div>

<div class="content">
    <table id="myTable" border="1">
        <!-- Header row: scope="col" ties each header to its column for assistive tech. -->
        <tr>
            <th scope="col">Title</th>
            <th id="dateHeader" class="sort-indicator" scope="col" onclick="sortTableByDate()">Date</th>
            <th scope="col">Venue</th>
            <th scope="col">
                Task
                <!-- aria-label names the control; values are matched case-insensitively
                     against the alt text of the task icons in each row. -->
                <select id="taskFilter" aria-label="Filter by task" onchange="filterTableByTask()">
                    <option value="all">All Tasks</option>
                    <option value="Image">Image</option>
                    <option value="Video">Video</option>
                    <option value="3D">3D</option>
                    <option value="Speech">Speech</option>
                    <!-- Add other task types as needed -->
                    <option value="Others">Others</option>
                </select>
            </th>
            <th scope="col">Resource</th>
        </tr>
<tr><td><a href="https://arxiv.org/abs/2208.15001">MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model</a></td><td>31 Aug 2022</td><td>TPAMI'2024</td><td><img src="./assets/others.svg" alt="Others"/> </td><td><a href="https://mingyuan-zhang.github.io/projects/MotionDiffuse.html"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/mingyuan-zhang/MotionDiffuse"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2209.12152">All are Worth Words: A ViT Backbone for Diffusion Models</a></td><td>25 Sep 2022</td><td>CVPR'2023</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://github.com/baofff/U-ViT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2209.12892">Learning to Learn with Generative Models of Neural Network Checkpoints</a></td><td>26 Sep 2022</td><td>arXiv</td><td><img src="./assets/others.svg" alt="Others"/> </td><td><a href="https://www.wpeebles.com/Gpt.html"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/wpeebles/G.pt"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2212.09748">Scalable Diffusion Models with Transformers</a></td><td>19 Dec 2022</td><td>ICCV'2023</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://www.wpeebles.com/DiT"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/facebookresearch/DiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2212.13771">Exploring Vision Transformers as Diffusion Learners</a></td><td>28 Dec 2022</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2303.03755">DLT: Conditioned layout generation with Joint Discrete-Continuous Diffusion Layout Transformer</a></td><td>07 Mar 2023</td><td>ICCV'2023</td><td><img src="./assets/others.svg" alt="Others"/> </td><td><a href="https://wix-incubator.github.io/DLT/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/wix-incubator/DLT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2303.14389">Masked Diffusion Transformer is a Strong Image Synthesizer</a></td><td>25 Mar 2023</td><td>ICCV'2023</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://github.com/sail-sg/MDT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://openreview.net/pdf?id=hRHX6XW9_Gu">Diffusion Transformer for Adaptive Text-to-Speech</a></td><td>03 May 2023</td><td>Interspeech'2023</td><td><img src="./assets/speech.svg" alt="Speech"/> </td><td><a href="https://recherchetts.github.io/dit/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2305.13311">VDT: General-purpose Video Diffusion Transformers via Mask Modeling</a></td><td>22 May 2023</td><td>ICLR'2024</td><td><img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://vdt-2023.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/RERV/VDT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2305.12708">ViT-TTS: Visual Text-to-Speech with Scalable Diffusion Transformer</a></td><td>22 May 2023</td><td>EMNLP'2023</td><td><img src="./assets/speech.svg" alt="Speech"/> </td><td><a href="https://vit-tts.github.io/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2305.13195">U-DiT TTS: U-Diffusion Vision Transformer for Text-to-Speech</a></td><td>22 May 2023</td><td>arXiv</td><td><img src="./assets/speech.svg" alt="Speech"/> </td><td><a href="https://eihw.github.io/u-dit-tts/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/EIHW/u-dit-tts"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2306.09305">Fast Training of Diffusion Models with Masked Transformers</a></td><td>15 Jun 2023</td><td>TMLR</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://github.com/Anima-Lab/MaskDiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2307.01831">DiT-3D: Exploring Plain Diffusion Transformers for 3D Shape Generation</a></td><td>04 Jul 2023</td><td>NeurIPS'2023</td><td><img src="./assets/3d.svg" alt="3D"/> </td><td><a href="https://diffusion-transformers-3d.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/DiT-3D/DiT-3D"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2309.07920">Large-Vocabulary 3D Diffusion Model with Transformer</a></td><td>14 Sep 2023</td><td>ICLR'2024</td><td><img src="./assets/3d.svg" alt="3D"/> </td><td><a href="https://ziangcao0312.github.io/difftf_pages/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/ziangcao0312/DiffTF"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2309.08251">Cartoondiff: Training-free Cartoon Image Generation with Diffusion Transformer Models</a></td><td>15 Sep 2023</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://cartoondiff.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/CartoonDiff/CartoonDiff"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2310.00426">PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis</a></td><td>30 Sep 2023</td><td>ICLR'2024</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://pixart-alpha.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/PixArt-alpha/PixArt-alpha"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2310.16305">Dolfin: Diffusion Layout Transformers without Autoencoder</a></td><td>25 Oct 2023</td><td>arXiv</td><td><img src="./assets/others.svg" alt="Others"/> </td><td></td></tr>
<tr><td><a href="https://www.amazon.science/publications/mapache-masked-parallel-transformer-for-advanced-speech-editing-and-synthesis">Mapache: Masked parallel transformer for advanced speech editing and synthesis</a></td><td>03 Dec 2023</td><td>ICASSP'2024</td><td><img src="./assets/speech.svg" alt="Speech"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.02139">DiffiT: Diffusion Vision Transformers for Image Generation</a></td><td>04 Dec 2023</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://github.com/NVlabs/DiffiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.04557">GenTron: Delving Deep into Diffusion Transformers for Image and Video Generation</a></td><td>07 Dec 2023</td><td>CVPR'2024</td><td><img src="./assets/image.svg" alt="Image"/> <img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://www.shoufachen.com/gentron_website"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.06662">Photorealistic Video Generation with Diffusion Models</a></td><td>11 Dec 2023</td><td>arXiv</td><td><img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://walt-video-diffusion.github.io/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.06400">DiT-Head: High-Resolution Talking Head Synthesis using Diffusion Transformers</a></td><td>11 Dec 2023</td><td>arXiv</td><td><img src="./assets/others.svg" alt="Others"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.07231">Fast Training of Diffusion Transformer with Extreme Masking for 3D Point Clouds Generation</a></td><td>12 Dec 2023</td><td>arXiv</td><td><img src="./assets/3d.svg" alt="3D"/> </td><td><a href="https://dit-3d.github.io/FastDiT-3D/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2312.08568">NViST: In the Wild New View Synthesis from a Single Image with Transformers</a></td><td>13 Dec 2023</td><td>arXiv</td><td><img src="./assets/others.svg" alt="Others"/> </td><td><a href="https://wbjang.github.io/nvist_webpage/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://link.springer.com/chapter/10.1007/978-981-99-8552-4_20">TransDDPM: Transformer-Based Denoising Diffusion Probabilistic Model for Image Restoration</a></td><td>28 Dec 2023</td><td>PRCV'2023</td><td><img src="./assets/image.svg" alt="Image"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2401.03048">Latte: Latent Diffusion Transformer for Video Generation</a></td><td>05 Jan 2024</td><td>arXiv</td><td><img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://maxin-cn.github.io/latte_project/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/Vchitect/Latte"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2401.05252">PIXART-δ: Fast and Controllable Image Generation with Latent Consistency Models</a></td><td>10 Jan 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://pixart-alpha.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/PixArt-alpha/PixArt-alpha"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2401.08740">SiT: Exploring Flow and Diffusion-based Generative Models with Scalable Interpolant Transformers</a></td><td>16 Jan 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://scalable-interpolant.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/willisma/SiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2401.11605">Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass Diffusion Transformers</a></td><td>21 Jan 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://crowsonkb.github.io/hourglass-diffusion-transformers/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/crowsonkb/k-diffusion"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.01516">Cross-view Masked Diffusion Transformers for Person Image Synthesis</a></td><td>02 Feb 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.06656">DiffsFormer: A Diffusion Transformer on Stock Factor Augmentation</a></td><td>05 Feb 2024</td><td>arXiv</td><td><img src="./assets/others.svg" alt="Others"/> </td><td></td></tr>
<tr><td><a href="https://openai.com/research/video-generation-models-as-world-simulators">Sora</a></td><td>15 Feb 2024</td><td>OpenAI</td><td><img src="./assets/image.svg" alt="Image"/> <img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://openai.com/sora"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.11588">SDiT: Spiking Diffusion Model with Transformer</a></td><td>18 Feb 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.12376">FiT: Flexible Vision Transformer for Diffusion Model</a></td><td>19 Feb 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://github.com/whlzy/FiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://stability.ai/news/stable-diffusion-3">Stable Diffusion 3</a></td><td>22 Feb 2024</td><td>Stability AI</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://stability.ai/news/stable-diffusion-3"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.14797">Snap Video: Scaled Spatiotemporal Transformers for Text-to-Video Synthesis</a></td><td>22 Feb 2024</td><td>arXiv</td><td><img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://snap-research.github.io/snapvideo/"><img src="./assets/website.svg" alt="Website"/></a> </td></tr>
<tr><td><a href="https://github.com/NUS-HPC-AI-Lab/OpenDiT">OpenDiT</a></td><td>26 Feb 2024</td><td>GitHub</td><td><img src="./assets/image.svg" alt="Image"/> <img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://github.com/NUS-HPC-AI-Lab/OpenDiT"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://arxiv.org/abs/2402.18331">FineDiffusion: Scaling up Diffusion Models for Fine-grained Image Generation with 10,000 Classes</a></td><td>28 Feb 2024</td><td>arXiv</td><td><img src="./assets/image.svg" alt="Image"/> </td><td><a href="https://finediffusion.github.io/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/FineDiffusion/FineDiffusion"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>
<tr><td><a href="https://pku-yuangroup.github.io/Open-Sora-Plan/">Open-Sora-Plan</a></td><td>01 Mar 2024</td><td>GitHub</td><td><img src="./assets/video.svg" alt="Video"/> </td><td><a href="https://pku-yuangroup.github.io/Open-Sora-Plan/"><img src="./assets/website.svg" alt="Website"/></a> <a href="https://github.com/PKU-YuanGroup/Open-Sora-Plan"><img src="./assets/code.svg" alt="Code"/></a> </td></tr>

</table>
</div>
</body>
</html>
